8 Differential Expression

We then compare the protein expression between strains for each pairwise comparison (28 unique pairwise combinations)

To detect differentially expressed genes, we use the limma analysis on normalized protein expression.

8.1 Volcano plots

First, we can look at the volcano plots (log-foldchange vs qvalue) for each unique pairwise comparison.

#ind_na_rows  = find_na_rows(int_norm,as.indices = T)
df_imputed = tibble( uniprot = int_norm$uniprot, is_imputed = rowSums( is.na(int_norm))>0)
#int_norm_bpca$imputed = rowSums( is.na(int_norm))>0
volcPlot(INPUT=int_norm_bpca, IMPUTED=df_imputed,MIN_LFC=2, MIN_PVAL=0.01, WHICH='both', TOPN = 20)
#> Joining, by = c("ID", "pValue", "qValue", "EffectSize",
#> "comparison", "sig", "log10_qvalue", "SGD", "ORF",
#> "UNIPROT", "GENENAME", "is_imputed")

8.2 Differentially expressed genes

# Without NA
volcano_data =  get_volcano_data(input_data=int_norm_nona, which='both',
                            min_lfc=2, min_pval=0.01, topn = 20)
df_limma = bind_rows(volcano_data) %>% as_tibble()
dfe_nona = subset(df_limma,sig!='Non significant')
N_DFE_NONA = nrow(dfe_nona)
NPROT_DFE_NONA =  n_distinct(dfe_nona$ID)

# With imputed expression to replace NA
dfe =  get_volcano_data(input_data=int_norm_bpca, which='both',
                            min_lfc=2, min_pval=0.01, topn = 20) %>% 
                bind_rows %>% as_tibble() %>% filter(sig!='Non significant')

N_DFE_IMPUTE = nrow(dfe)
NPROT_DFE_IMPUTE =  n_distinct(dfe$ID)

down = dfe %>% group_by(ID) %>% dplyr::filter(sig=='Downregulated') %>%
       summarize( strains_down = paste0(comparison,collapse=' '),
                  down_AMH = str_count(strains_down,'AMH-'),
                  down_BAN = str_count(strains_down,'BAN-'),
                  down_BED = str_count(strains_down,'BED-'),
                  down_BPL = str_count(strains_down,'BPL-'),
                  down_BTT = str_count(strains_down,'BTT-'),
                  down_CMP = str_count(strains_down,'CMP-'),
                  down_CPI = str_count(strains_down,'CPI-'),
                  down_CQC = str_count(strains_down,'CQC-'))
up = dfe %>% group_by(ID) %>% dplyr::filter(sig=='Upregulated') %>%  
      summarize( strains_up = paste0(comparison,collapse=' '),
                  up_AMH = str_count(strains_up,'AMH-'),
                  up_BAN = str_count(strains_up,'BAN-'),
                  up_BED = str_count(strains_up,'BED-'),
                  up_BPL = str_count(strains_up,'BPL-'),
                  up_BTT = str_count(strains_up,'BTT-'),
                  up_CMP = str_count(strains_up,'CMP-'),
                  up_CPI = str_count(strains_up,'CPI-'),
                  up_CQC = str_count(strains_up,'CQC-'))
# get_dfe(int_norm, MIN_LFC=2, MIN_PVAL=0.01,  WHICH='both', TOPN = 20) %>% remove_rownames() %>% 
#   dplyr::left_join(sc_identifiers, by=c('ID'='UNIPROT'))

# Number of times a hit is differentially expressed
df_dfe = dfe %>% left_join(janitor::tabyl(dfe,ID,sig)) %>%
         left_join(down) %>% left_join(up) %>%
         rename(uniprot=ID) %>% 
         group_by(uniprot,comparison) %>% 
        mutate( n_strains_up = sum(c_across(starts_with('up_')) !=0 ),
                n_strains_down = sum(c_across(starts_with('down_'))!=0)) %>%
        replace_na(list(n_strains_up=0,n_strains_down=0)) %>%
        left_join(evo_yeast, by=c('uniprot'='UNIPROT')) 
#> Joining, by = "ID"
#> Joining, by = "ID"
#> Joining, by = "ID"
  

df_dfe_annot = df_dfe %>%
          left_join(sc_annotation_orf,by=c('uniprot'='UNIPROT')) %>%
  mutate(uniprot_link = paste0("<a href='https://www.uniprot.org/uniprot/",uniprot,"'>",uniprot,"</a>"),
         sgd_link = paste0("<a href='https://www.yeastgenome.org/locus/",SGD,"'>",SGD,"</a>"),
         regulated = Downregulated+Upregulated) %>% 
  dplyr::relocate(uniprot,uniprot_link,sgd_link,regulated,Downregulated,Upregulated, 
                  GENENAME,ORF,PNAME,'FUNCTION','BIOPROCESS_all','ORTHO','OTHER')

We get 225 genes differentially expressed when excluding genes with missing expression in any samples.

On average, each gene is detected in 3.4888889 unique pairwise strain comparison.

After imputation of missing expression with bpca (Bayesian missing value imputation), we get 230 more genes differentially expressed (n=455).

On average, each gene is detected in 3.6351648 unique pairwise strain comparison.

The following table shows the list of differentially expressed genes across all unique pairwise comparison, with annotations data and conservation/snp information.

library(kableExtra)
library(formattable)
library(DT)

ft_dt = df_dfe_annot %>% 
  formattable(
    list(
      `Downregulated` = color_tile("white", "red"),
      `Upregulated` = color_tile("white", "blue"),
      `regulated` = color_tile("white", "gray")
    )
) %>% as.datatable(
        options = list(
            fixedHeader=T,
            paging = TRUE, pageLength = 20,  ## paginate the output and #rows for each page
            scrollY = TRUE,   ## enable scrolling on X/Y axis
            #autoWidth = TRUE, ## use smart column width handling
            server = FALSE,   ## use client-side processing
            dom = 'Bfrtip', buttons = list('csv', 'excel', list(extend = 'colvis')),
            fixedColumns = list(leftColumns = 1),
            columnDefs = list(list(width = '50px', visible=TRUE, targets = "_all"))
          ),
  extensions = c('FixedHeader','FixedColumns','Buttons'),
  selection = 'single',           ## enable selection of a single row
  filter = 'bottom',              ## include column filters at the bottom
  rownames = FALSE,               ## don't show row numbers/names
  width = NULL, 
  height = NULL,
  caption = NULL
) %>% 
   formatStyle(columns = 1:30, target= 'row',lineHeight='100%', `font-size` = '12px')

ft_dt

8.3 Functional map for diffentially expressed genes

library(treemap)
library(d3Tree)
treemap(df_dfe_annot, index=c("BIOPROCESS_all", "comparison"), vSize='regulated', vColor="EffectSize", type="value") 
treemap(df_dfe_annot, index=c("BIOPROCESS_all", "GENENAME"), vSize="regulated", vColor="EffectSize", type="value",)

8.4 Heatmap of expression differences

dat_scaled = int_scaled_strains %>% as.data.frame() %>% rownames_to_column('uniprot') 

# Transpose the matrix to calculate distance between experiments, row-wise
d_pair <- dat_scaled[,-1] %>% t() %>%
  dist(.,method = "euclidean", diag = FALSE, upper = FALSE)
# Calculate the distance between proteins row-wise 
d_prot <- dat_scaled %>% dplyr::filter( uniprot %in% dfe$ID) %>% dplyr::select(-uniprot) %>% dist(.,method = "euclidean", diag = FALSE, upper = FALSE)

dfe_lfc = bind_rows(volcano_data) %>% 
          pivot_wider(id_cols=ID, names_from = 'comparison', values_from = 'EffectSize') %>% 
          dplyr::filter(ID %in% dfe$ID) %>% left_join(sc_identifiers,by=c('ID'='UNIPROT')) %>%
          column_to_rownames('GENENAME') %>% dplyr::select(-ORF,-ID,-SGD)

# Heatmap of differentially expressed genes
p_dfe= pheatmap::pheatmap(dfe_lfc,fontsize = 5,cutree_rows = 10,cellwidth = 5,cellheight =5,border_color = NA) %>% 
        ggplotify::as.ggplot()

dfe_exp = dat_scaled %>% dplyr::filter( uniprot %in% dfe$ID) %>% 
          left_join(sc_identifiers,by=c('uniprot'='UNIPROT'))  %>% 
          dplyr::filter(!duplicated(GENENAME)) %>%
          mutate(GENENAME=if_na(GENENAME,uniprot))%>%
          column_to_rownames(var = 'GENENAME')

p_dfe_exp=pheatmap::pheatmap(dfe_exp %>% dplyr::select(-ORF,-uniprot,-SGD) ,
                             fontsize = 8,cellwidth = 5,cellheight =5,border_color = NA,treeheight_col = 10)

textcol <- "grey40"
# further modified ggplot
# p_exp <- ggplot(dfe_exp%>%pivot_longer(where(is.numeric), names_to = 'strain', values_to='exp') %>% rownames_to_column('GENENAME'), 
#             aes(x=strain, y=GENENAME, fill=exp))+
#   geom_tile(colour="white", size=0.1)+
#   labs(x="", y="")+
#   scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(n = 7, name =
#   "RdYlBu")))(100))+
#   #scale_fill_manual(values=c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da"), na.value = "grey90")+
#   theme_grey(base_size=10)+
#   theme(legend.position="right", legend.direction="vertical",
#         legend.title=element_text(colour=textcol),
#         legend.margin=margin(grid::unit(0, "cm")),
#         legend.text=element_text(colour=textcol, size=7, face="bold"),
#         legend.key.height=grid::unit(0.8, "cm"),
#         legend.key.width=grid::unit(0.2, "cm"),
#         axis.text.x=element_text(size=8, angle=90,colour=textcol,hjust = 1),
#         axis.text.y=element_text(vjust=0.2, colour=textcol),
#         axis.ticks=element_line(size=0.4),
#         plot.background=element_blank(),
#         panel.border=element_blank(),
#         plot.margin=margin(0.7, 0.4, 0.1, 0.2, "cm"),
#         plot.title=element_text(colour=textcol, hjust=0, size=14, face="bold")
#         ) + coord_fixed(ratio = 1)
# p_exp